
pacman::p_load(tidyverse,
               fst,
               tidymodels,
               assertthat,
               doParallel,
               embed,
               glue,
               fs,
               tictoc,
               furrr,
               lubridate,
               glmnet,
               glmnetUtils
)

#' Add the year and year/outcome columns and recode the default flag.
#'
#' Appends `year` (the result of `year(qtr)`) and `year_outcome`
#' (`10 * year + t_default`), then converts `t_default` to a factor with
#' levels `0` and `1`.
#'
#' @param df_raw_data Data frame with a `qtr` column accepted by `year()` and
#'   a `t_default` column that supports arithmetic on entry.
#' @return `df_raw_data` with `year` and `year_outcome` added and `t_default`
#'   replaced by its factor version.
Add_Year_and_Default <- function(df_raw_data) {

  with_year <- mutate(df_raw_data, year = year(qtr))

  # `year_outcome` needs the numeric `t_default`, so it is derived before the
  # factor conversion below.
  with_year_outcome <- mutate(with_year, year_outcome = (10 * year) + t_default)

  mutate(with_year_outcome, t_default = factor(t_default, levels = c(0, 1)))
}


# Files -------------------------------------------------------------------

# Quarterly date sequence from TEST_START_QTR up to TEST_END_QTR; only files
# whose parsed quarter is in this sequence are scored.
test_qtrs <- seq.Date(as.Date(TEST_START_QTR), as.Date(TEST_END_QTR), by = "quarter")

# One row per file in the input directory. `qtr` is the first run of eight
# digits in the path parsed as year-month-day; `rand_no_cid` is the four
# digits that sit directly before the ".fst" extension.
file_info <- tibble(file_name = dir_ls("../../data_qtr_rand_no_cid_with_outcome//")) %>% 
  mutate(
    qtr = ymd(str_extract(file_name, "[0-9]{8}")),
    # The dot is escaped so that only a literal ".fst" qualifies, not any
    # character followed by "fst".
    rand_no_cid = str_extract(file_name, "[0-9]{4}(?=\\.fst)")
  )

# Files restricted to the requested quarters and IDs, with a
# "<yyyymmdd>_<rand_no_cid>" key.
test_files <- file_info %>% 
  filter(qtr %in% test_qtrs, rand_no_cid %in% RAND_NO_CID) %>% 
  mutate(
    qtr_rand_no_cid = paste0(str_replace_all(qtr, "-", ""), "_", rand_no_cid)
  ) 
  
# Directory holding one smbinning result file per binned variable.
path_smbinning <- paste0("../../data/pipeline_outputs/", 
                         SPECIAL_SUFFIX, "/", "smbinning_", TRAINING_SUFFIX,
                         "_", RAND_NO_CID_SMALLEST_LARGEST, "/")
 
# Every requested ID must have at least one file. The comparison is
# set-based: the order in which files are listed (or in which RAND_NO_CID is
# written) must not decide whether the check passes.
stopifnot(
  "Desired test IDs not present in pulled files" =
    setequal(unique(test_files$rand_no_cid), RAND_NO_CID)
)


# Model Components --------------------------------------------------------

# Shared path pieces. All artefacts live below
# "../../data/pipeline_outputs/<SPECIAL_SUFFIX>/" in directories named after
# the training suffix and the ID range.
dir_pipeline_outputs <- file.path("../../data/pipeline_outputs", SPECIAL_SUFFIX)
suffix_training_ids <- paste0(TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST)

dir_final_model <- file.path(
  dir_pipeline_outputs,
  paste0("final_model_logistic_", suffix_training_ids, "_", MODEL_METRIC)
)
dir_train_binned <- file.path(
  dir_pipeline_outputs,
  paste0("train_binned_", suffix_training_ids)
)
dir_lasso_selected <- file.path(
  dir_pipeline_outputs,
  paste0("lasso_selected_vars_", suffix_training_ids)
)

# Final-model artefacts: the recipe applied with bake(), the `lambda` value
# passed to predict() as `s`, and the fitted model object.
recipe_binning <- readRDS(file.path(dir_final_model, "discretization_recipe.rds"))

optimal_penalty <- file.path(dir_final_model, "optimal_hyperparameters.csv") %>% 
  read_csv() %>% 
  pull(lambda)

fit_logistic <- readRDS(file.path(dir_final_model, "final_model.rds"))

# Variable-name lists written next to the binned training data; each CSV
# contributes its `Variable` column.
v_var_less_than_5_unique <- file.path(dir_train_binned, "vars_less_than_5_unique.csv") %>% 
  read_csv() %>% 
  pull(Variable)

v_numeric_attr <- file.path(dir_train_binned, "numeric_predictors.csv") %>% 
  read_csv() %>% 
  pull(Variable)

v_non_numeric_attr <- file.path(dir_train_binned, "non_numeric_predictors.csv") %>% 
  read_csv() %>% 
  pull(Variable)

# Predictors listed in the lasso selection output.
v_lasso_vars <- file.path(dir_lasso_selected, "df_lasso_vars_and_dict.csv") %>% 
  read_csv() %>% 
  pull(Variable)


# Script ------------------------------------------------------------------

# Read every selected test file, keeping only the identifier/outcome columns,
# the lasso-selected predictors and consumer_category, then stack the files
# into one data frame tagged with `rand_no_cid`.
# `v_lasso_vars` is passed as a plain character vector: `columns` is an
# ordinary argument, not a tidyselect context, and calling all_of() outside a
# selecting function is deprecated.
df_raw <- test_files %>% 
  mutate(
    data = map(file_name, ~ read_fst(., columns = c("cid", "qtr", "t_default", 
                                                    v_lasso_vars, "consumer_category")))
  ) %>% 
  select(rand_no_cid, data) %>% 
  unnest(cols = data) %>% 
  glimpse()

# Add `year` / `year_outcome` and turn `t_default` into a 0/1 factor.
df <- df_raw %>% 
  Add_Year_and_Default() %>% 
  glimpse()


# Type and missing-value handling. The across() calls run in the order
# written, so a variable that appears in more than one list is transformed by
# each step in turn:
#   1. variables listed in `v_var_less_than_5_unique` become factors;
#   2. NAs in `v_numeric_attr` columns are replaced by the sentinel -999999;
#   3. NAs in `v_non_numeric_attr` columns become the explicit level "missing".
# NOTE(review): fct_explicit_na() is deprecated in recent forcats releases;
# its replacement may treat levels differently, so confirm before swapping.
df_factor_processing <- df %>% 
  #select(c("cid", "qtr", "t_default", "rand_no_cid", all_of(v_lasso_vars))) %>% 
  mutate(
    across(all_of(v_var_less_than_5_unique), ~ as.factor(.)),
    across(.cols = all_of(v_numeric_attr), ~ ifelse(is.na(.), -999999, .)),
    across(.cols = all_of(v_non_numeric_attr), ~ fct_explicit_na(., "missing"))
  ) %>% 
  glimpse()


# Names of the columns containing "attr" that are still numeric after the
# preprocessing above.
v_numeric_attr <- df_factor_processing %>% 
  select(contains("attr")) %>% 
  select(where(~ is.numeric(.))) %>% 
  names()

# List the smbinning directory once; the loop below reuses this listing
# instead of hitting the file system on every iteration.
smbinning_files <- dir_ls(path_smbinning)

# Variable name = file name without the ".rds" extension.
v_var_smbinning <- smbinning_files %>% 
  str_extract("(?<=/)[^/]+(?=\\.rds)") 

# Only lasso-selected variables that have a stored smbinning result are binned.
v_to_bin <- intersect(v_lasso_vars, v_var_smbinning)


# Replace each variable in `v_to_bin` by its binned version, in place.
# seq_along() makes the loop a no-op when there is nothing to bin.
for (i in seq_along(v_to_bin)) {
  
  var_ <- v_to_bin[i]
  
  # Exact file-name comparison: a pattern match on "<var>.rds" would also pick
  # up files whose name merely ends in the same text.
  file_name_smb <- smbinning_files[basename(smbinning_files) == paste0(var_, ".rds")]
  stopifnot(
    "Expected exactly one smbinning file per variable" = length(file_name_smb) == 1
  )
  
  smbinning_result <- read_rds(file_name_smb)
  
  col_ <- df_factor_processing[[var_]]
  
  # The stored result is either the message string below or an object with a
  # `cuts` element. Checking is.character() first avoids comparing a whole
  # result object against a string.
  no_significant_splits <- is.character(smbinning_result) &&
    all(smbinning_result == "No significant splits")
  
  if (no_significant_splits) {
    
    if (n_distinct(col_) < 5) {
      # Fewer than five distinct values: keep the values themselves as labels.
      df_factor_processing[[var_]] <- as.character(col_)
    } else {
      # Fall back to quantile-based bins.
      # NOTE(review): these quantiles are computed on the data being scored,
      # not read from a training artefact; confirm this matches how the
      # training data was binned.
      cuts_ <- unique(c(-Inf, quantile(col_), Inf))
      df_factor_processing[[var_]] <- cut(col_, cuts_, right = TRUE, include.lowest = TRUE)
    }
  } else {
    
    # Use the stored cut points, padded so every value falls in some bin.
    cuts_ <- c(-Inf, smbinning_result$cuts, Inf)
    df_factor_processing[[var_]] <- cut(col_, cuts_, right = TRUE, include.lowest = TRUE)
    
  }
  
  # Progress is counted against the variables actually being binned.
  print(glue("{i}/{length(v_to_bin)}"))
  
}

# Apply the stored recipe to the preprocessed test data.
df_test_baked <- bake(recipe_binning, df_factor_processing)

# Predictions on the response scale at the stored penalty value.
v_fitted <- as.numeric(predict(fit_logistic, df_test_baked, type = "response", s = optimal_penalty))


# One row per (rand_no_cid, qtr) combination present in the baked data, with
# the "<yyyymmdd>_<rand_no_cid>" key used in the output file names.
df_rand_no_qtr <- df_test_baked %>% 
  distinct(rand_no_cid, qtr) %>% 
  mutate(
    qtr_rand_no_cid = glue("{str_remove_all(qtr, '-')}_{rand_no_cid}")
  )

df_fitted_all <- df_test_baked %>%
  mutate(
    v_fitted = v_fitted
  )

# Output directory is the same for every file, so it is built once.
dir_fitted <- paste0("../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "fitted_logistic_",
                     SUFFIX_FITTED_, "/")

# Write one .fst file per (qtr, rand_no_cid) combination containing cid, qtr
# and the fitted value. seq_len(nrow()) keeps the loop from running at all
# when there are no combinations (1:0 would iterate over 1 and 0).
for (i in seq_len(nrow(df_rand_no_qtr))) {
  
  qtr_ <- df_rand_no_qtr$qtr[i]
  rand_no_cid_ <- df_rand_no_qtr$rand_no_cid[i]
  qtr_rand_no_cid_ <- df_rand_no_qtr$qtr_rand_no_cid[i]

  df_fitted <- df_fitted_all %>% 
    filter(qtr == qtr_, rand_no_cid == rand_no_cid_) %>% 
    select(
      cid,
      qtr,
      v_fitted
    )
  
  write_fst(df_fitted, paste0(dir_fitted, "fitted_", qtr_rand_no_cid_, ".fst"))
  
}




